HW 03

INFO 526 Summer 2025

Author

Nathaniel Cross

0 - Setup

# Install and load packages
if(!require(pacman))
  install.packages("pacman")

pacman::p_load(tidyverse, 
               glue,
               scales,
               countdown,
               ggthemes,
               gt,
               palmerpenguins,
               openintro,
               ggrepel,
               patchwork,
               quantreg,
               janitor,
               colorspace,
               broom,
               fs,
               here,
               openintro,
               gghighlight,
               lubridate,
               dsbox,
               ggridges,
               gtable,
               ggimage,
               png,
               ggpubr
               )

devtools::install_github("tidyverse/dsbox")

# Set theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# Set width of code output
options(width = 65)

# Set figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 7,        # 7" width
  fig.asp = 0.618,      # the golden ratio
  fig.retina = 3,       # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300             # higher dpi, sharper image
)

1 - Du Bois challenge.

# Loading in the data
income <- read_csv("data/income.csv")

income |>
  glimpse()
Rows: 7
Columns: 7
$ Class          <chr> "$100-200", "$200-300", "$300-400", "$40…
$ Average_Income <dbl> 139.10, 249.45, 335.66, 433.82, 547.00, …
$ Rent           <dbl> 19, 22, 23, 18, 13, 0, 0
$ Food           <dbl> 43, 47, 43, 37, 31, 37, 29
$ Clothes        <dbl> 28, 23, 18, 15, 17, 19, 16
$ Tax            <dbl> 9.9, 4.0, 4.5, 5.5, 5.0, 8.0, 4.5
$ Other          <dbl> 0.1, 4.0, 11.5, 24.5, 34.0, 36.0, 50.5
# Correcting the data
income[1, 6] = 0.1
income[1, 7] = 9.9 # Source: Marcel Hebing, StackOverflow

# Pivoting the data long
income_clean <- income |>
  pivot_longer(
    cols = c("Rent", "Food", "Clothes", "Tax", "Other"), 
    names_to = "category", 
    values_to = "expenditure"
  ) |>
  glimpse()
Rows: 35
Columns: 4
$ Class          <chr> "$100-200", "$100-200", "$100-200", "$10…
$ Average_Income <dbl> 139.10, 139.10, 139.10, 139.10, 139.10, …
$ category       <chr> "Rent", "Food", "Clothes", "Tax", "Other…
$ expenditure    <dbl> 19.0, 43.0, 28.0, 0.1, 9.9, 22.0, 47.0, …
# Set image
image <- "https://cdn.pixabay.com/photo/2012/12/06/06/27/paper-68829_1280.jpg"

# Releveling factor variables
income_clean <- income_clean |>
  mutate(
    Class = fct_relevel(Class, "$100-200", "$200-300", "$300-400", "$400-500", "$500-750", "$750-1000", "$1000 AND OVER"),
    category = fct_relevel(category, "Rent", "Food", "Clothes", "Tax", "Other"),
    Class = fct_rev(Class),
    category = fct_rev(category)
  )

# Creating cumulative summations
income_clean <- income_clean |>
  group_by(Class) |>
  mutate(label_y = cumsum(expenditure) - 0.5 * expenditure) |>
  ungroup() # Source: R Graphics Codebook


# Prepping labels
income_clean2 <- income_clean|>
  filter(category != "Rent" & category != "Tax")

income_clean2 <- income_clean2 |>
  mutate(
    perc = "%",
    expend2 = glue("{expenditure}{perc}")
  ) 
  
# Plotting
plot <- income_clean |>
  ggplot(aes(x = Class, y = expenditure, fill = category)) +
  geom_col(width = 0.5) +# Source: Geeks for Geeks (https://www.geeksforgeeks.org/r-language/grouped-stacked-and-percent-stacked-barplot-in-ggplot2/) +
    geom_text(data = income_clean2, aes(y = label_y, label = expend2), color = "black", size = 3, family = "mono") + 
  labs(
    y = NULL,
    x = NULL,
    title = "INCOME AND EXPENDITURE OF 150 NEGRO FAMILIES IN ATLANTA, GA., U.S.A."
  ) +
   annotate(
    geom = "text",
    x = c("$100-200", "$200-300", "$300-400", "$400-500", "$500-750"),
    y = c(9.5, 11, 11.5, 9, 6.5),
    label = c("19%", "22%", "23%", "18%", "13%"),
    size = 3,
    color = "white",
    family = "mono"
  ) +
   annotate(
    geom = "text",
    x = c("$200-300", "$300-400", "$400-500", "$500-750", "$750-1000", "$1000 AND OVER"),
    y = c(94, 86.25, 72.75, 63.5, 60, 47.25),
    label = c("4%", "4.5%", "5.5%", "5%", "8%", "4.5%"),
    size = 2,
    color = "black",
    family = "mono"
  ) +
  annotate(
    geom = "text",
    x = "$100-200",
    y = 0,
    label = "CLASS    ACTUAL AVERAGE                                   \n\n\n",
    size = 2,
    color = "black",
    family = "mono"
  ) +
   scale_x_discrete(labels = c("1,000     $1,125 \nAND OVER          ", "$750-1000   $880   ", "$500-750   $547   ", "$400-500   $433.82", "$300-400   $335.66", "$200-300   $249.45", "$100-200   $139.10")) +
  coord_flip(clip = "off") +
  scale_fill_manual(breaks = c('Rent', 'Food', 'Clothes', 'Tax', 'Other'), values = c("black", "purple", "sienna1", "slategray1", "snow2")) + # Source: https://www.statology.org/ggplot-legend-order/
   theme(
    legend.position = "top",
    plot.title.position = "plot",
    plot.title = element_text(size = 10, hjust = 0.5, face = "bold"),
    panel.grid = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size =7, color = "black"),
    text = element_text(family = "mono"),
    legend.title = element_blank(), 
    legend.text  = element_text(size = 9),
    legend.key.size = unit(0.3, "cm") # Source: https://www.tidyverse.org/blog/2024/02/ggplot2-3-5-0-legends/
  )
  
ggbackground(plot, image) # Source: Guangchuang Yu (https://guangchuangyu.github.io/2018/04/setting-ggplot2-background-with-ggbackground/)

2 - COVID survey - interpret

The visualization deconstructs responses to six statements regarding perceptions of the COVID-19 vaccine and its safety. Responses are measured on a 1-5 Likert scale, with the means depicted in the figure. Error bars are also depicted from the 10th to 90th percentiles of responses per question. Responses are shown on a macro-level first, then classified by identity, such as age, gender, and race (among others). For each statement, there is minor variation by identity group in mean response on the Likert scale; the interesting part of this figure is derived from its depiction of the distributions of each response, which are far less homogeneous. As the surveyed respondents are nursing and medical university students across the United States, most responses indicate high trust in the vaccine and its safety. This holds true for the statements, “I believe the vaccine is safe,” “Getting the vaccine makes me feel safer at work,” “I am confident in the scientific vetting process for new vaccines,” “I trust the information I have received,” and “I will recommend the vaccine to others” (abridged for brevity), where all respondents express agreement with the statement (strong or somewhat agree). Less agreement is introduced in response to the statement, “I am concerned about side effects,” where agreement is neutral (neither agree nor disagree).

Some of the most intriguing results for me in this plot are located among identity groups with heterogeneous variation in responses between statements. For example, there is high variation in responses to the statement “I believe the vaccine is safe” among Asians (with the 10-90th percentiles encompassing almost the entire scale), however this variation is almost completely eliminated when the same group was asked if they would recommend the vaccine to others. I would have expected similar variation in response to these questions because if one does not find the vaccine safe, I would expect them to be less likely to recommend it. I also find it interesting in comparing variation between statements at the macro level, regardless of identity group. More particularly, in almost every identity group there is high heterogeneity in responses to the “I am concerned about side effects” statement, however much less variation in “I will recommend the vaccine to others” statement. This could be indicative of a stronger sense of community in light of a global pandemic (dependent on when the data were collected). Finally, I find a comparison of nursing and medical students interesting, especially when comparing the distributions of their responses. As both are in the medical field literally learning about the processes involved in designing and selling vaccines, I would have expected the distribution of these responses to be similar, however for all statements but the first and third, the distributions are larger for medical students than nurses. While this difference may not be statistically significant, is makes an interesting comparison and begs the questions as to why.

3 - COVID survey - reconstruct

# Loading in data
covid <- read_csv("data/covid-survey.csv")

# Renaming columns to row 1
colnames(covid) <- covid[1,] # Source: https://stackoverflow.com/questions/32054368/use-first-row-data-as-column-names-in-r

# Deleting row 1 (now column names)
remove.rows <- 1
covid <- covid[!(row.names(covid) %in% remove.rows),] # Source: https://www.tutorialspoint.com/how-to-remove-rows-in-an-r-data-frame-using-row-names

# Dimensions of dataset
dim(covid)
[1] 1121   14
# Eliminate rows with all NAs but response ID
covid <- covid |>
  filter(!if_all(-response_id, is.na))

# Dimensions of the dataset
dim(covid)
[1] 1111   14
# Relabel values of columns
covid |>
  glimpse()
Rows: 1,111
Columns: 14
$ response_id             <chr> "1", "2", "4", "5", "6", "7", "…
$ exp_profession          <chr> "1", "1", "1", "1", "1", "1", "…
$ exp_flu_vax             <chr> "1", "1", "1", "1", "1", "1", "…
$ exp_gender              <chr> "0", "1", "0", "0", "1", "1", "…
$ exp_race                <chr> "2", "2", "5", "5", "5", "5", "…
$ exp_ethnicity           <chr> "2", "2", "2", "2", "2", "2", "…
$ exp_age_bin             <chr> "25", "20", "25", "25", "25", "…
$ exp_already_vax         <chr> "1", "1", "1", "1", "1", "1", "…
$ resp_safety             <chr> "5", "5", "5", "5", "5", "5", "…
$ resp_confidence_science <chr> "2", "1", "1", "1", "1", "1", "…
$ resp_concern_safety     <chr> "2", "1", "1", "1", "1", "1", "…
$ resp_feel_safe_at_work  <chr> "1", "1", "1", "1", "1", "1", "…
$ resp_will_recommend     <chr> "1", "1", "1", "1", "1", "1", "…
$ resp_trust_info         <chr> "1", "1", "1", "1", "1", "2", "…
covid <- covid |>
    mutate(
      exp_already_vax = recode(exp_already_vax, "0" = 'No', "1" = 'Yes'),      
      exp_flu_vax = recode(exp_flu_vax, "0" = 'No', "1" = 'Yes'),
      exp_profession = recode(exp_profession, "0" = 'Medical', "1" = 'Nursing'),
      exp_gender = recode(exp_gender, "0" = 'Male', "1" = 'Female', "3" = 'Non-binary third gender', "4" = 'Prefer not to say'),
      exp_race = recode(exp_race, "1" = 'American Indian/Alaskan Native', "2" = 'Asian', "3" = 'Black/African American', "4" = 'Native Hawaiian/Other Pacific Islander', "5" = 'White'),
      exp_ethnicity = recode(exp_ethnicity, "1" = 'Hispanic/Latino', "2" = 'Non-Hispanic/Non-Latino'),
      exp_age_bin = recode(exp_age_bin, "0" = '<20', "20" = '21-25', "25" = '26-30', "30" = '>30')
    )

# Dimensions of dataset
dim(covid)
[1] 1111   14
# Calculating mean and percentiles
covid_survey_longer <- covid |>
  pivot_longer(
    cols = starts_with("exp_"),
    names_to = "explanatory",
    values_to = "explanatory_value"
  ) |> # This pivot lengthens the data so that instead of one column for each explanatory variable, there are instead multiple observations of each response, one per explanatory variable. The values of these explanatory variables were pivoted into a new column, explanatory_value.
  filter(!is.na(explanatory_value)) |> # Eliminates NAs from explanatory values.
  pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
  ) # This pivot again lengthens the data, this time for response variables. This means that for each response variable for each response ID, explanatory variables and their values are listed.

# Display tibble
covid_survey_longer
# A tibble: 43,428 × 5
   response_id explanatory    explanatory_value response         
   <chr>       <chr>          <chr>             <chr>            
 1 1           exp_profession Nursing           resp_safety      
 2 1           exp_profession Nursing           resp_confidence_…
 3 1           exp_profession Nursing           resp_concern_saf…
 4 1           exp_profession Nursing           resp_feel_safe_a…
 5 1           exp_profession Nursing           resp_will_recomm…
 6 1           exp_profession Nursing           resp_trust_info  
 7 1           exp_flu_vax    Yes               resp_safety      
 8 1           exp_flu_vax    Yes               resp_confidence_…
 9 1           exp_flu_vax    Yes               resp_concern_saf…
10 1           exp_flu_vax    Yes               resp_feel_safe_a…
# ℹ 43,418 more rows
# ℹ 1 more variable: response_value <chr>
# Destring response_value
covid_survey_longer$response_value <- as.numeric(as.character(covid_survey_longer$response_value)) # Source: Geeks for Geeks (https://www.geeksforgeeks.org/r-language/convert-data-frame-column-to-numeric-in-r/)

sapply(covid_survey_longer, class)
      response_id       explanatory explanatory_value 
      "character"       "character"       "character" 
         response    response_value 
      "character"         "numeric" 
# Calculating summary statistics for identity groups
covid_survey_summary_stats_by_group <- covid_survey_longer |>
  group_by(explanatory, explanatory_value, response) |>
  summarise(mean = mean(response_value, na.rm = TRUE),
            low = quantile(response_value, 0.1, na.rm = TRUE),
            high = quantile(response_value, 0.9, na.rm = TRUE) # Source: https://tbradley1013.github.io/2018/10/01/calculating-quantiles-for-groups-with-dplyr-summarize-and-purrr-partial/
            )

# Display tibble
covid_survey_summary_stats_by_group
# A tibble: 126 × 6
# Groups:   explanatory, explanatory_value [21]
   explanatory explanatory_value response        mean   low  high
   <chr>       <chr>             <chr>          <dbl> <dbl> <dbl>
 1 exp_age_bin 21-25             resp_concern_…  3.32     2     5
 2 exp_age_bin 21-25             resp_confiden…  1.31     1     2
 3 exp_age_bin 21-25             resp_feel_saf…  1.20     1     2
 4 exp_age_bin 21-25             resp_safety     1.95     1     5
 5 exp_age_bin 21-25             resp_trust_in…  1.29     1     2
 6 exp_age_bin 21-25             resp_will_rec…  1.09     1     1
 7 exp_age_bin 26-30             resp_concern_…  3.35     1     5
 8 exp_age_bin 26-30             resp_confiden…  1.40     1     2
 9 exp_age_bin 26-30             resp_feel_saf…  1.29     1     2
10 exp_age_bin 26-30             resp_safety     2.16     1     5
# ℹ 116 more rows
# Calculating summary statistics for all
covid_survey_summary_stats_all <- covid_survey_longer |>
  group_by(response) |>
  summarise(mean = mean(response_value, na.rm = TRUE),
            low = quantile(response_value, 0.1, na.rm = TRUE),
            high = quantile(response_value, 0.9, na.rm = TRUE)
            ) |>
  mutate(
    explanatory = "All",
    explanatory_value = ""
  )

covid_survey_summary_stats_all$explanatory_value <- as.factor(as.character(covid_survey_summary_stats_all$explanatory_value))

sapply(covid_survey_summary_stats_all, class)
         response              mean               low 
      "character"         "numeric"         "numeric" 
             high       explanatory explanatory_value 
        "numeric"       "character"          "factor" 
# Display tibble
covid_survey_summary_stats_all
# A tibble: 6 × 6
  response         mean   low  high explanatory explanatory_value
  <chr>           <dbl> <dbl> <dbl> <chr>       <fct>            
1 resp_concern_s…  3.28     1     5 All         ""               
2 resp_confidenc…  1.43     1     2 All         ""               
3 resp_feel_safe…  1.36     1     2 All         ""               
4 resp_safety      2.03     1     5 All         ""               
5 resp_trust_info  1.40     1     2 All         ""               
6 resp_will_reco…  1.21     1     2 All         ""               
# Bind the two summary statistic dataframes together
covid_survey_summary_stats <- bind_rows(covid_survey_summary_stats_all, covid_survey_summary_stats_by_group)

# Display tibble
covid_survey_summary_stats
# A tibble: 132 × 6
   response        mean   low  high explanatory explanatory_value
   <chr>          <dbl> <dbl> <dbl> <chr>       <chr>            
 1 resp_concern_…  3.28     1     5 All         ""               
 2 resp_confiden…  1.43     1     2 All         ""               
 3 resp_feel_saf…  1.36     1     2 All         ""               
 4 resp_safety     2.03     1     5 All         ""               
 5 resp_trust_in…  1.40     1     2 All         ""               
 6 resp_will_rec…  1.21     1     2 All         ""               
 7 resp_concern_…  3.32     2     5 exp_age_bin "21-25"          
 8 resp_confiden…  1.31     1     2 exp_age_bin "21-25"          
 9 resp_feel_saf…  1.20     1     2 exp_age_bin "21-25"          
10 resp_safety     1.95     1     5 exp_age_bin "21-25"          
# ℹ 122 more rows
# Relevel factors
covid_survey_summary_stats <- covid_survey_summary_stats |>
  mutate(
      explanatory_value = fct_rev(fct_relevel(explanatory_value, ">30", "26-30", "21-25", "<20")),
      explanatory = fct_relevel(explanatory, "All", "exp_age_bin", "exp_gender", "exp_race", "exp_ethnicity", "exp_profession", "exp_already_vax", "exp_flu_vax"),
      explanatory_value = fct_relevel(explanatory_value, "Female", "Male", "Non-binary third gender", "Prefer not to say"),
      explanatory_value = fct_relevel(explanatory_value, "American Indian/Alaskan Native", "Asian", "Black/African American", "Native Hawaiian/Other Pacific Islander", "White"),
      explanatory_value = fct_relevel(explanatory_value, "Non-Hispanic/Non-Latino", "Hispanic/Latino"),
      explanatory_value = fct_relevel(explanatory_value, "Medical", "Nursing"),
      explanatory_value = fct_relevel(explanatory_value, "No", "Yes")
  )

# Set labels
covid_survey_summary_stats <- covid_survey_summary_stats |>
    mutate(
      explanatory = recode(explanatory, "exp_age_bin" = 'Age', "exp_gender" = 'Gender', "exp_race" = 'Race', "exp_ethnicity" = 'Ethnicity', "exp_profession" = 'Profession', "exp_already_vax" = 'Had COVID vaccine', "exp_flu_vax" = 'Had flu vaccine this year'),
      response = fct_relevel(response, "resp_safety", "resp_feel_safe_at_work", "resp_concern_safety", "resp_confidence_science",   "resp_trust_info", "resp_will_recommend"),
      response = recode(response, 
                        "resp_safety" = 'Based on my understanding, I believe the vaccine is safe', 
                        "resp_confidence_science" = 'I am confident in the scientific vetting process for the new COVID vaccines', 
                        "resp_feel_safe_at_work" = 'Getting the vaccine will make me feel safer at work', 
                        "resp_will_recommend" = 'I will recommend the vaccine to family, friends, and community members', 
                        "resp_trust_info" = 'I trust the information that I have received about the vaccines',
                        "resp_concern_safety" = 'I am concerned about the safety and side effects of the vaccine')
    )

# Creating the plot
covid_survey_summary_stats |>
ggplot(aes(x = mean, y = factor(explanatory_value))) +
  geom_point(size = 0.75) +
  geom_errorbarh(aes(xmin = low, xmax = high), height = 0.3) +
  facet_grid(
    rows = vars(explanatory), 
    cols = vars(response), 
    scales = "free_y",
    space = "free_y", 
    labeller = labeller(explanatory = label_wrap_gen(15), response = label_wrap_gen(15)) 
    ) +
  scale_x_continuous(breaks = 1:5) +
  labs(
    x = "Mean Likert score\n(Error bars range from 10th to 90th percentile)",
    y = NULL,
  ) +
  theme(
    strip.text = element_text(size = 6),
    strip.text.y = element_text(angle = 0),
    axis.text.y = element_text(size = 6),
    axis.text.x = element_text(size = 6),
    panel.spacing = unit(0, "lines"),
    panel.spacing.x = unit(0.3, "lines"),
    axis.title.x = element_text(size = 8),
    panel.grid = element_blank(),
    strip.background = element_rect(fill = "gray90", color = "black")
  )

4 - COVID survey - re-reconstruct

# Calculating summary statistics for identity groups
covid_survey_summary_stats_by_group2 <- covid_survey_longer |>
  group_by(explanatory, explanatory_value, response) |>
  summarise(mean = mean(response_value, na.rm = TRUE),
            low = quantile(response_value, 0.25, na.rm = TRUE),
            high = quantile(response_value, 0.75, na.rm = TRUE) # Source: https://tbradley1013.github.io/2018/10/01/calculating-quantiles-for-groups-with-dplyr-summarize-and-purrr-partial/
            )

# Calculating summary statistics for all
covid_survey_summary_stats_all2 <- covid_survey_longer |>
  group_by(response) |>
  summarise(mean = mean(response_value, na.rm = TRUE),
            low = quantile(response_value, 0.25, na.rm = TRUE),
            high = quantile(response_value, 0.75, na.rm = TRUE)
            ) |>
  mutate(
    explanatory = "All",
    explanatory_value = ""
  )

covid_survey_summary_stats_all2$explanatory_value <- as.factor(as.character(covid_survey_summary_stats_all2$explanatory_value))

sapply(covid_survey_summary_stats_all2, class)
         response              mean               low 
      "character"         "numeric"         "numeric" 
             high       explanatory explanatory_value 
        "numeric"       "character"          "factor" 
# Bind the two summary statistic dataframes together
covid_survey_summary_stats2 <- bind_rows(covid_survey_summary_stats_all2, covid_survey_summary_stats_by_group2)

# Display tibble
covid_survey_summary_stats2
# A tibble: 132 × 6
   response        mean   low  high explanatory explanatory_value
   <chr>          <dbl> <dbl> <dbl> <chr>       <chr>            
 1 resp_concern_…  3.28     2     4 All         ""               
 2 resp_confiden…  1.43     1     2 All         ""               
 3 resp_feel_saf…  1.36     1     1 All         ""               
 4 resp_safety     2.03     1     3 All         ""               
 5 resp_trust_in…  1.40     1     2 All         ""               
 6 resp_will_rec…  1.21     1     1 All         ""               
 7 resp_concern_…  3.32     2     4 exp_age_bin "21-25"          
 8 resp_confiden…  1.31     1     2 exp_age_bin "21-25"          
 9 resp_feel_saf…  1.20     1     1 exp_age_bin "21-25"          
10 resp_safety     1.95     1     2 exp_age_bin "21-25"          
# ℹ 122 more rows
# Relevel factors
covid_survey_summary_stats2 <- covid_survey_summary_stats2 |>
  mutate(
      explanatory_value = fct_rev(fct_relevel(explanatory_value, ">30", "26-30", "21-25", "<20")),
      explanatory = fct_relevel(explanatory, "All", "exp_age_bin", "exp_gender", "exp_race", "exp_ethnicity", "exp_profession", "exp_already_vax", "exp_flu_vax"),
      explanatory_value = fct_relevel(explanatory_value, "Female", "Male", "Non-binary third gender", "Prefer not to say"),
      explanatory_value = fct_relevel(explanatory_value, "American Indian/Alaskan Native", "Asian", "Black/African American", "Native Hawaiian/Other Pacific Islander", "White"),
      explanatory_value = fct_relevel(explanatory_value, "Non-Hispanic/Non-Latino", "Hispanic/Latino"),
      explanatory_value = fct_relevel(explanatory_value, "Medical", "Nursing"),
      explanatory_value = fct_relevel(explanatory_value, "No", "Yes")
  )

# Set labels
covid_survey_summary_stats2 <- covid_survey_summary_stats2 |>
    mutate(
      explanatory = recode(explanatory, "exp_age_bin" = 'Age', "exp_gender" = 'Gender', "exp_race" = 'Race', "exp_ethnicity" = 'Ethnicity', "exp_profession" = 'Profession', "exp_already_vax" = 'Had COVID vaccine', "exp_flu_vax" = 'Had flu vaccine this year'),
      response = fct_relevel(response, "resp_safety", "resp_feel_safe_at_work", "resp_concern_safety", "resp_confidence_science",   "resp_trust_info", "resp_will_recommend"),
      response = recode(response, 
                        "resp_safety" = 'Based on my understanding, I believe the vaccine is safe', 
                        "resp_confidence_science" = 'I am confident in the scientific vetting process for the new COVID vaccines', 
                        "resp_feel_safe_at_work" = 'Getting the vaccine will make me feel safer at work', 
                        "resp_will_recommend" = 'I will recommend the vaccine to family, friends, and community members', 
                        "resp_trust_info" = 'I trust the information that I have received about the vaccines',
                        "resp_concern_safety" = 'I am concerned about the safety and side effects of the vaccine')
    )

# Creating the plot
covid_survey_summary_stats2 |>
ggplot(aes(x = mean, y = factor(explanatory_value))) +
  geom_point(size = 0.75) +
  geom_errorbarh(aes(xmin = low, xmax = high), height = 0.3) +
  facet_grid(
    rows = vars(explanatory), 
    cols = vars(response), 
    scales = "free_y",
    space = "free_y", 
    labeller = labeller(explanatory = label_wrap_gen(15), response = label_wrap_gen(15)) 
    ) +
  scale_x_continuous(breaks = 1:5) +
  labs(
    x = "Mean Likert score\n(Error bars range from 25th to 75th percentile)",
    y = NULL,
  ) +
  theme(
    strip.text = element_text(size = 6),
    strip.text.y = element_text(angle = 0),
    axis.text.y = element_text(size = 6),
    axis.text.x = element_text(size = 6),
    panel.spacing = unit(0, "lines"),
    panel.spacing.x = unit(0.3, "lines"),
    axis.title.x = element_text(size = 8),
    panel.grid = element_blank(),
    strip.background = element_rect(fill = "gray90", color = "black")
  )

This plot drastically differs from that created in question 3. Reducing the percentile range of the error bars by 15 percentile on each end (30 percentiles overall), has significantly shrunk the distribution of the data displayed in the error bar. This definitely impacts interpretations of the plot, as well, as error bars that once spanned the entire Likert scale are now much more concentrated around their respective means due to the changes made. This trend is especially visible in the faceted columns, “I believe the vaccine is safe”, and “I am concerned about side effects”, which now have much more concentrated error ranges, even when faceted by identity, as well. Interpreting both of these plots side-by-side changes interpretation of these specific columns as we now know more about the distribution of the data, as though we were looking at a violin plot. One interesting finding that remains unchanged between the two plots is that of non-binary third gender respondents’ answer to the “I believe the vaccine is safe” question, whose error bar is unchanged. This could be due to a low n, however. Overall, most of these error bars become shorter (or more concentrated), which tells us that these distributions are more concentrated between the 25-75 percentiles than the 10-90 percentiles. This makes complete sense given the Central Limit Theorem which tells us that as n > 30, distributions will converge to a normal distribution.

5 - COVID survey - another view

# Wrangling
# Subsetting data to needed variables
covid_q51 = subset(covid, select = -c(exp_profession, exp_flu_vax, exp_gender, exp_race, exp_ethnicity, exp_age_bin))

# Pivoting longer
covid_q51 <- covid_q51 |>
    pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
    ) |>
  filter(!is.na(exp_already_vax))

sapply(covid_q51, class)
    response_id exp_already_vax        response  response_value 
    "character"     "character"     "character"     "character" 
# Summary statistics
covid_q51$response_value <- as.numeric(as.character(covid_q51$response_value))

covid_q51_summary <- covid_q51 |>
  group_by(response, exp_already_vax) |>
  summarize(mean = mean(response_value, na.rm = TRUE))

# Converting "No" means to negatives
covid_q51_summary <- covid_q51_summary |>
  mutate(mean_adj = ifelse(exp_already_vax == "No", -mean, mean))

# Relabeling response
covid_q51_summary <- covid_q51_summary |>
  mutate(
          response = recode(response, 
                        "resp_safety" = 'Based on my understanding, I believe the vaccine is safe', 
                        "resp_confidence_science" = 'I am confident in the scientific vetting process for the new COVID vaccines', 
                        "resp_feel_safe_at_work" = 'Getting the vaccine will make me feel safer at work', 
                        "resp_will_recommend" = 'I will recommend the vaccine to family, friends, and community members', 
                        "resp_trust_info" = 'I trust the information that I have received about the vaccines',
                        "resp_concern_safety" = 'I am concerned about the safety and side effects of the vaccine')
    )
covid_q51_summary$response_wrap = str_wrap(covid_q51_summary$response, width = 45) # Source: https://stackoverflow.com/questions/21878974/wrap-long-axis-labels-via-labeller-label-wrap-in-ggplot2

# Plot
covid_q51_summary |>
  ggplot(aes(x = mean_adj, y = reorder(response_wrap, -mean_adj), fill = exp_already_vax)) +
  geom_col(position = "identity", width = 0.6) +
  labs(
    y = NULL, 
    x = "Mean response",
    title = "Already-vaccinated students more confident\nabout the safety of COVID-19 vaccine",
    caption = "Source: Shah et al. | Johns Hopkins School of Medicine",
    subtitle = "On a Likert scale from 1 (strongly agree) to 5 (strongly disagree)"
  ) +
  scale_x_continuous(
    labels = abs,
    limits = c(-5, 5),
    breaks = c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5)
    ) +
  annotate(
    geom = "text",
    x = 0,
    y = Inf,
    label = "Not already vaccinated  ",
    size = 3,
    color = "#E69F00",
    hjust = 1
  ) +
   annotate(
    geom = "text",
    x = 0,
    y = Inf,
    label = "  Already vaccinated ",
    size = 3,
    color = "#56B4E9",
    hjust = 0
  ) +
  scale_fill_manual(values=c("#E69F00","#56B4E9")) +
  theme(
    panel.grid.minor.x = element_blank(),
    legend.position = "none",
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(size = 8),
    plot.caption = element_text(size = 6),
    axis.text.y = element_text(size = 7),
    plot.title.position = "plot",
    plot.subtitle = element_text(size = 10)
  ) +
  coord_cartesian(clip = "off")

Alt text

  • The figure is a diverging bar plot titled, “Already-vaccinated students more confident in safety of COVID-19 vaccine” and depicts responses to six statements measuring perceptions of the vaccine and its safety using a Likert scale where 1 indicates strong agreement and 5 strong disagreement. Mean responses to six statements are shows, the statements being “Based on my understanding, I believe the vaccine is safe,” “I am confident in the scientific vetting process for the new COVID vaccines,” “Getting the vaccine will make me feel safer at work,” “I will recommend the vaccine to family, friends, and community members,” “I trust the information that I have received about the vaccines,” and “I am concerned about the safety and side effects of the vaccine.” The bars diverge conditioned on whether or not the respondent had already received the vaccine at the time of surveying and there is a clear trend that shows that those already vaccinated agree with all statements more than those who were unvaccinated (with the exception of “I am concerned about the safety and side effects of the vaccine,” which is phrased inversely and for this statement the trend is indirect). More specifically, for all statements except the inverted one, already-vaccinated respondents replied with an approximate of 1.5 mean Likert scale, and those unvaccinated on average replied around 3.2. The data yields from Shah et al. from Johns Hopkins School of Medicine.
# Wrangling
# Subsetting data to needed variables
covid_q52 = subset(covid, select = -c(exp_profession, exp_flu_vax, exp_race, exp_already_vax, exp_ethnicity, exp_age_bin))

# Pivoting longer
covid_q52 <- covid_q52 |>
    pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
    ) |>
  filter(!is.na(exp_gender))

sapply(covid_q52, class)
   response_id     exp_gender       response response_value 
   "character"    "character"    "character"    "character" 
# Destring IV
covid_q52$response_value <- factor(covid_q52$response_value, levels = 5:1)

# Summarizing
covid_q52_summary <- covid_q52 |>
  group_by(response, response_value) |>
  summarize(n = n(), .groups = "drop") |>
  group_by(response) |>
  mutate(percent = n / sum(n))


# Relabeling response
covid_q52_summary <- covid_q52_summary |>
  mutate(
          response = recode(response, 
                        "resp_safety" = 'Based on my understanding, I believe the vaccine is safe', 
                        "resp_confidence_science" = 'I am confident in the scientific vetting process for the new COVID vaccines', 
                        "resp_feel_safe_at_work" = 'Getting the vaccine will make me feel safer at work', 
                        "resp_will_recommend" = 'I will recommend the vaccine to family, friends, and community members', 
                        "resp_trust_info" = 'I trust the information that I have received about the vaccines',
                        "resp_concern_safety" = 'I am concerned about the safety and side effects of the vaccine')
    )


# Set wrap
covid_q52_summary$response_wrap = str_wrap(covid_q52_summary$response, width = 45) # Source: https://stackoverflow.com/questions/21878974/wrap-long-axis-labels-via-labeller-label-wrap-in-ggplot2

# Plot
covid_q52_summary |>
  filter(!is.na(response_value)) |>
ggplot(aes(x = percent, y = reorder(response_wrap, -percent), fill = response_value)) +
  geom_col(position = "fill", width = 0.7) +
    labs(
    y = NULL,
    x = "Percent of responses",
    title = "Majority of students confident about the \nsafety of the COVID-19 vaccine",    
    caption = "Source: Shah et al. | Johns Hopkins School of Medicine",
    subtitle = "On a Likert scale from 1 (strongly agree) to 5 (strongly disagree)"
  ) +
  scale_fill_manual(breaks = c('1', '2', '3', '4', '5'), values = c("#CC79A7", "#56B4E9", "#009E73", "#F0E442", "#E69F00")) +
  scale_x_continuous(limits = c(0, 1), breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1),labels = scales::percent_format()) +
  theme(
    panel.grid.minor.x = element_blank(),
    legend.position = "right",
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(size = 8),
    plot.caption = element_text(size = 6),
    axis.text.y = element_text(size = 7),
    plot.title.position = "plot",
    plot.subtitle = element_text(size = 10),
    legend.title = element_blank(),
    legend.key.size = unit(0.3, 'cm'), # Source: https://www.statology.org/ggplot2-legend-size/
    legend.text = element_text(size = 8)
  )

Alt text

  • The figure is a stacked bar plot titled, “Majority of students confident about the safety of the COVID-19 vaccine” and depicts distribtuion of Likert scale replies in response to six statements measuring perceptions of the COVID-19 vaccine. The Likert scale is scaled from 1, indicating strong agreement, to 5, showing strong disagreement. For the statements, “I will recommend the vaccine to family, friends, and community members,” “I trust the information that I have received about the vaccines,” “I am confident in the scientific vetting process for the new COVID vaccines,” and “Getting the vaccine will make me feel safer at work,” the percentage of students who strongly or somewhat agree hovers around 90 to 92%. For the statement, “Based on my understanding, I believe the vaccine is safe,” this agreement is lower, around 75%. Finally, for the statement, “I am concerned about the safety and side effects of the vaccine,” agreement is around 35%, while disagreement is around 55%. The data yields from Shah et al. from Johns Hopkins School of Medicine.

Plot comparison

  • These plots depict aligned stories, although through different means. Both plots show us that most of the student respondents from this survey felt confident about the vaccine and its safety, as seen in the high rates of strong agreement with all five of the positively framed statements here and moderate disagreement with the negatively framed statement. However, these plots adopt different approaches and provide us with different details. For example, the diverging bar plot shows us another variable, whether or not the respondent had already been vaccinated, revealing interesting correlations with confidence in the vaccine. However, this plot only shows us the mean Likert response for each statement for each condition group. Contrastingly, the stacked bar plot shows us more of the distribution of Likert responses (1 through 5) for each of the six statements, but does not incorporate the added complexity of an additional variable. The distribution provides interesting details, though, such as the distribution of agreement responses to the statement, “Based on my understanding, I believe the vaccine is safe,” which does not follow the same trend as the other positively framed statements and yields less agreement.